# Load necessary libraries for the SARIMA analysis below.
# NOTE(review): options(warn = -1) silences ALL warnings for the entire
# session; prefer wrapping specific calls in suppressWarnings() so real
# problems elsewhere are not hidden.
options(warn=-1)
library(astsa) #provides time series model functions (arima, sarima)
library(TSstudio) #plotly library for plotting time series data
library(xts) #library to convert data frame to time series object
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Load order matters: forecast masks astsa's `gas` dataset (see output below),
# so later references to `gas` would resolve to the forecast package's version.
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'forecast'
## The following object is masked from 'package:astsa':
##
## gas
# Plot the raw quarterly Johnson & Johnson earnings series, then decompose
# it into trend, seasonal, and remainder components.
# Fix: corrected the user-facing title typo "Quaterly" -> "Quarterly".
ts_plot(JohnsonJohnson, title = 'Quarterly earnings of J&J share',
        Xtitle = 'Year', Ytitle = 'Share price')
ts_decompose(JohnsonJohnson)
It can be clearly seen that there is both trend and seasonality in the data.
In order to remove the trend, we use the difference operator on the share price. Specifically, a new series is constructed where the value at the current time step is calculated as the difference between the original observation and the observation at the previous time step.
# 1. Log-return transformation: first difference of the log series (d = 1).
# Compute it once and reuse it for the plot and the correlograms.
log_returns <- diff(log(JohnsonJohnson))
ts_plot(log_returns, title ='Log-return transformation i.e. differencing')
acf(log_returns, main = "ACF")    # seasonal spikes recur every 4 lags
pacf(log_returns, main = 'pacf')
It can be observed that the trend has been removed to a great extent.
# 3. Ljung-Box test for residual autocorrelation.
# Fixes: Box.test() defaults to the Box-Pierce statistic, so the Ljung-Box
# variant must be requested explicitly; and `lag` must be a whole number —
# a fractional lag yields a meaningless non-integer chi-squared df.
# NOTE(review): this tests the raw series differenced twice, while the rest
# of the analysis works on log(JohnsonJohnson) — confirm this is intended.
Box.test(diff(diff(JohnsonJohnson)),
         lag = round(log(length(JohnsonJohnson))),
         type = "Ljung-Box")
##
## Box-Pierce test
##
## data: diff(diff(JohnsonJohnson))
## X-squared = 128.99, df = 4.4308, p-value < 2.2e-16
# Stack the ACF and PACF of the seasonally differenced log-return series.
# Fix: par() changes global graphics state; save the old settings and
# restore them afterwards so later plots (e.g. sarima diagnostics) are
# not drawn into a leftover 2x1 layout.
op <- par(mfrow = c(2, 1))
acf(diff(diff(log(JohnsonJohnson)), 4))
pacf(diff(diff(log(JohnsonJohnson)), 4))
par(op)  # restore previous graphics settings
p-value is less than 0.05 and hence, autocorrelation still exists between the lags and cannot be eliminated/reduced further.
# Grid search over SARIMA(p,1,q)(P,1,Q)[4] orders on the log series,
# reporting AIC, residual SSE, and the residual Box-test p-value for each.
# Fixes: `<-` for assignment; loop over the actual orders 0:1 instead of
# 1:2 with p-1/q-1 shifting (printed output is unchanged); round the
# Box.test lag so the chi-squared df is a whole number; braced formatting.
d <- 1 # one non-seasonal difference, since we differenced the series once
D <- 1 # one seasonal difference
s <- 4 # seasonal period (quarterly data)
for (p in 0:1) {
  for (q in 0:1) {
    for (P in 0:1) {
      for (Q in 0:1) {
        # equivalent to the original p+d+q+P+D+Q <= 10 bound on 1-based indices
        if (p + d + q + P + D + Q <= 6) {
          model <- arima(x = log(JohnsonJohnson), order = c(p, d, q),
                         seasonal = list(order = c(P, D, Q), period = s))
          # residual whiteness check; lag rounded to a whole number
          test <- Box.test(model$residuals,
                           lag = round(log(length(model$residuals))))
          sse <- sum(model$residuals^2)
          cat(p, d, q, P, D, Q, "AIC:", model$aic, "SSE:", sse,
              'p-value:', test$p.value, '\n')
        }
      }
    }
  }
}
## 0 1 0 0 1 0 AIC: -124.0685 SSE: 0.9377872 p-value: 0.0002610792
## 0 1 0 0 1 1 AIC: -126.3493 SSE: 0.8856995 p-value: 0.0001606501
## 0 1 0 1 1 0 AIC: -125.9198 SSE: 0.8908546 p-value: 0.0001978113
## 0 1 0 1 1 1 AIC: -124.3648 SSE: 0.8854555 p-value: 0.0001574029
## 0 1 1 0 1 0 AIC: -145.5139 SSE: 0.6891989 p-value: 0.03543717
## 0 1 1 0 1 1 AIC: -150.7528 SSE: 0.6265214 p-value: 0.6089542
## 0 1 1 1 1 0 AIC: -150.9134 SSE: 0.6251635 p-value: 0.7079173
## 0 1 1 1 1 1 AIC: -149.1317 SSE: 0.6232876 p-value: 0.6780876
## 1 1 0 0 1 0 AIC: -139.8248 SSE: 0.7467495 p-value: 0.03503386
## 1 1 0 0 1 1 AIC: -146.0191 SSE: 0.6692692 p-value: 0.5400206
## 1 1 0 1 1 0 AIC: -146.0319 SSE: 0.6689661 p-value: 0.5612965
## 1 1 0 1 1 1 AIC: -144.3766 SSE: 0.6658382 p-value: 0.5459446
## 1 1 1 0 1 0 AIC: -145.8284 SSE: 0.667109 p-value: 0.2200492
## 1 1 1 0 1 1 AIC: -148.7706 SSE: 0.6263678 p-value: 0.594822
## 1 1 1 1 1 0 AIC: -148.9175 SSE: 0.6251104 p-value: 0.7195471
## 1 1 1 1 1 1 AIC: -144.4483 SSE: 0.6097742 p-value: 0.3002703
#hence we chose the model (0,1,1,1,1,0)
The model with the lowest AIC is (0,1,1,1,1,0), and it also has one of the highest p-values.
# 5. Fit and diagnose the selected SARIMA(0,1,1)(1,1,0)[4] model on the
#    log series. sarima() draws residual diagnostics (standardized
#    residuals, residual ACF, normal Q-Q plot, Ljung-Box p-values).
# Left as a bare call so the returned fit summary auto-prints.
sarima(log(JohnsonJohnson),0,1,1,1,1,0,4)
## initial value -2.237259
## iter 2 value -2.429075
## iter 3 value -2.446737
## iter 4 value -2.455821
## iter 5 value -2.459761
## iter 6 value -2.462511
## iter 7 value -2.462602
## iter 8 value -2.462749
## iter 9 value -2.462749
## iter 9 value -2.462749
## iter 9 value -2.462749
## final value -2.462749
## converged
## initial value -2.411490
## iter 2 value -2.412022
## iter 3 value -2.412060
## iter 4 value -2.412062
## iter 4 value -2.412062
## iter 4 value -2.412062
## final value -2.412062
## converged
## $fit
##
## Call:
## arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D, Q), period = S),
## include.mean = !no.constant, transform.pars = trans, fixed = fixed, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ma1 sar1
## -0.6796 -0.3220
## s.e. 0.0969 0.1124
##
## sigma^2 estimated as 0.007913: log likelihood = 78.46, aic = -150.91
##
## $degrees_of_freedom
## [1] 77
##
## $ttable
## Estimate SE t.value p.value
## ma1 -0.6796 0.0969 -7.0104 0.0000
## sar1 -0.3220 0.1124 -2.8641 0.0054
##
## $AIC
## [1] -1.910297
##
## $AICc
## [1] -1.908298
##
## $BIC
## [1] -1.820318
# Residual diagnostics from the sarima() fit above: no ACF spike exceeds the
# noise band, the Ljung-Box p-values indicate no remaining autocorrelation,
# and the residual Q-Q plot is close to linear.
#6. forecasting
# Refit the chosen SARIMA(0,1,1)(1,1,0)[4] model on the original (unlogged)
# series so that forecasts come out directly on the share-price scale.
# NOTE(review): model selection above used log(JohnsonJohnson), so the AIC
# printed here (100.71) is not comparable to the log-scale AICs.
model.fit<-arima(x=(JohnsonJohnson),order=c(0,1,1),seasonal=list(order=c(1,1,0),period=4))
print(model.fit)
##
## Call:
## arima(x = (JohnsonJohnson), order = c(0, 1, 1), seasonal = list(order = c(1,
## 1, 0), period = 4))
##
## Coefficients:
## ma1 sar1
## -0.7835 0.1382
## s.e. 0.0674 0.1167
##
## sigma^2 estimated as 0.1919: log likelihood = -47.36, aic = 100.71
Note that the AIC of this model (100.71) is not comparable with the log-scale AICs above, because this fit uses the untransformed series. The results of the forecast are plotted below.

# Forecasting {.tabset}
# Forecast ahead (8 quarters, per the transcript below) and draw the fan
# chart with 80%/95% prediction intervals.
# Fix: the original called forecast(model.fit) twice, refitting the
# forecast object; compute it once and reuse it.
fc <- forecast(model.fit)
plot(fc)
fc  # auto-prints the point forecasts and 80%/95% intervals
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 1981 Q1 17.87703 17.31558 18.43847 17.01837 18.73568
## 1981 Q2 16.28482 15.71037 16.85926 15.40628 17.16336
## 1981 Q3 17.56016 16.97300 18.14733 16.66218 18.45815
## 1981 Q4 13.21237 12.61276 13.81198 12.29535 14.12940
## 1982 Q1 19.48728 18.51876 20.45581 18.00606 20.96851
## 1982 Q2 17.88647 16.88369 18.88926 16.35285 19.42010
## 1982 Q3 19.15150 18.11559 20.18741 17.56722 20.73579
## 1982 Q4 14.81231 13.74430 15.88032 13.17893 16.44569
We can see that the model has captured the trend and seasonality in the data and predicted on similar lines.